In [5]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

### gensim, LDA model, stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as swords
import pyLDAvis.gensim

#nltk Lemmatize and stemmer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import *
np.random.seed(2018)
#nltk.download('wordnet')
#nltk.download('stopwords')
from nltk.corpus import stopwords as st
from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger')

### LDA visualization
import pyLDAvis
import pyLDAvis.gensim
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
import plotly as py
import plotly.graph_objects as go
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
from sklearn.decomposition import PCA
import seaborn as sns
from pyclustertend import hopkins
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score

1. NLP Text Data Research

Motivation

Text data has been a recent challenge for data scientists to research and analyze given its unstructured format and non-numeric feature. It is quite different from the typical numeric or categorical features we encounter more often in data analysis task. The dataset we picked is a merged data source from three data sources, because we want to ensure there are three big topics, that are somewhat different, in order for our clustering algorithm to explore.

[Note]: Alought our dataset only contains two columns, it is confirmed by instructor that it is ok to use, because text data are usually harder to clean and will eventual result in more columns.

  • 1. Kaggle Wine Review (400rows): Containing review written by professional sommelier about wines

  • 2. Kaggle Coronavirus_Tweet (400rows): Containing real posts from Twitter by real human users discussing coronavirus

  • 3. Kaggle Disater Tweet(400rows): Containing real posts from Twitter by real human users discussing disasters

Resource References

Miglani, Aman. “Coronavirus Tweets NLP - Text Classification.” Kaggle, 8 Sept. 2020, www.kaggle.com/datatattle/covid-19-nlp-text-classification.

Zackthoutt. “Wine Reviews.” Kaggle, 27 Nov. 2017,Kaggle, www.kaggle.com/zynicide/wine-reviews.

“Natural Language Processing with Disaster Tweets.” Kaggle, www.kaggle.com/c/nlp-getting-started/data.

Blei, David M., et al. “Latent Dirichlet Allocation.” Journal of Machine Learning Research, 1 Jan. 2003, https://jmlr.org/papers/volume3/blei03a/blei03a.pdf.

2. Text Precessing and EDA

Dataset preview

In [6]:
my_data = pd.read_csv('Text_Data_3_Source.csv', error_bad_lines=False)
for i in range(0,5): print(my_data.iloc[i,0])
Just happened a terrible car crash
Heard about #earthquake is different cities, stay safe everyone.
there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all
Apocalypse lighting. #Spokane #wildfires
Typhoon Soudelor kills 28 in China and Taiwan
In [7]:
print(my_data.shape)
(1200, 2)

Text Preprocessing

1. Tokenize words, remove punctuations and break down a sentence with punctuations to a list of words.

eg. "How are you doing today?" ---> ['how','are','you','doing','today']
In [8]:
data = my_data['text_content'].values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", '', sent) for sent in data]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])
[['just', 'happened', 'terrible', 'car', 'crash']]

2. Build bigram and trigram models - words appears very often should be considered as phrases

eg: univeristy of illinois appears 500 times in our data, 'university of illinois' should be consider as one word.
In [9]:
# Build the bigram and trigram models

### min_count: words must appear together 5 times in order to form a phrase, otherwise ignored.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[1]]])
['heard', 'about', 'earthquake', 'is', 'different', 'cities', 'stay', 'safe', 'everyone']

3. Prepare a list of stopwords - words that doesn't have any meaning

In [10]:
stop_words = st.words('english')

### extend some common case of stopwords, and stopwords from gensim package to nltk stopwords library
stop_words.extend(['from', 're'] + list(swords))

# first 5 examples
stop_words[0:5]
Out[10]:
['i', 'me', 'my', 'myself', 'we']

4. Remove stopwords, apply bigram, trigram, and lemmatization(lemmatize a word meaning conver the word to its dictionary-form)

ex: boys -> boy, playting -> play, played -> play
In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(texts):
    return [[WordNetLemmatizer().lemmatize(word, pos=get_wordnet_pos(word)) for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
In [12]:
## Apply those function to our data

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_trigrams = make_trigrams(data_words_nostops)

# lemmatize
data_lemmatized = lemmatize(data_words_trigrams)
print(data_lemmatized[:1])
[['happen', 'terrible', 'car', 'crash']]
In [13]:
## quick example of lemmatize
WordNetLemmatizer().lemmatize('forgive', pos=get_wordnet_pos('forgave'))
Out[13]:
'forgive'

Preliminary Exploratory Data Analysis

Due to text data type which is very special, in this part, we only show top 50 words for merged data, top 10 words for each topic and the number of words in each row appearing in reviews aftering text preprocessing.

In [14]:
# labels 
disater = data_lemmatized[0:400]
covid = data_lemmatized[400:800]
wine = data_lemmatized[800:]

#covid = data_lemmatized[0:400]
#wine = data_lemmatized[400:]
labels = []
for i in range(0,1200):
    if i<400: labels.append(0)
    elif i>=800: labels.append(2)
    else: labels.append(1)
In [22]:
row_count = []
for lst in range(0, len(data_lemmatized)):
    row_count.append(len(data_lemmatized[lst]))
row_count = np.array(row_count)

plt.hist(row_count, bins = 20, color = 'c', edgecolor='k')
plt.axvline(row_count.mean(), color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
plt.text(row_count.mean()*1.1, max_ylim*0.9, 'Mean: {:.2f}'.format(row_count.mean()))
plt.title('Distribution of Word Count in Each Row ')
plt.xlabel('#words per row')
plt.ylabel('#documents')
plt.show()
In [16]:
def top_words(data, title, i):
    x = pd.Series(data)
    df = x.apply(pd.Series).stack().reset_index(drop = True)
    all_words = df.value_counts()

    freq_word = [go.Bar(
                x = all_words.index.values[1:i+2],
                y = all_words.values[1:i+2],
                marker= dict(colorscale='Jet',
                             color = all_words.values[1:100]
                            ),
                text='Word counts'
        )]


    layout = go.Layout(
        title= title
    )
    fig = go.Figure(data=freq_word, layout=layout)
    fig.show()

Most frequent words from merged data

In [17]:
top_words(data = data_lemmatized, title='Top Words Frequencies', i=50)

Most frequent words from Disaster Tweet

In [18]:
top_words(data =disater, title='Disaster: Top 10 Words Frequencies ', i=10)

Most frequent words from Coronavirus Tweet

In [19]:
top_words(data =covid, title='Covid: Top 10 Words Frequencies', i=10)

top_words(data =wine, title='Wine: Top 10 Words Frequencies', i=10)

num of words per documents

  • index 0-400: disaster tweet - fewer words per document(row)
  • index 401-800: coronavirus tweet - intermediate words per document(row)
  • index 801-1200: wine reviews - most words per document(row)
In [46]:
n_words = []

for i in data_lemmatized:
    n_words.append(len(i))

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(range(0,1200),n_words)
plt.xlabel('#row_id')
plt.ylabel('#words')
plt.show()

3. Pre-analysis Q&A

In this part, we use TF-IDF(X1).

In [33]:
documents = []
for i in data_lemmatized: documents.append(" ".join(i))
    
# TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(documents) # sparse matrix

Is the dataset clusterable?

We use PCA method to reduce the sparse matrix dimensionality to 50D and then apply hopkins method to if the data is clusterable. Hopkins statistics are really near to 0, so our data is clusterable.

In [34]:
X1_pca_50 = PCA(n_components=50, random_state=430).fit_transform(X1.todense())
In [35]:
num_trials=5
hopkins_stats=[]
for i in range(0,num_trials):
    n = len(X1_pca_50)
    p = int(0.1 * n)
    hopkins_stats.append(hopkins(X1_pca_50, p))
print(hopkins_stats)
[0.15158633980208316, 0.15126865808557616, 0.15777218086862, 0.14470094935039124, 0.1519411427802623]

how many underlying clusters does the data have?

  • We use TSNE and PCA to reduce dim, plot 'x' and 'y' and then find that there are potential 2 or 3 clusters.
  • We choose 3 clusters.
In [36]:
# TSNE 
tsne_tf = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X1.todense()))
sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1], hue=labels)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('TSNE Plot')
plt.show()

What are the shapes of the underlying clusters?

Hard to describe, because this is text data, not numeric data.

Are the clusters balanced in size?

Yes, from the plot, the clusters are indeed balanced in size. Furthermore, we intentionally make it balance - 400 rows from each data source to ensure balanced cluster.

Do any of the clusters that you identified overlap with each other?

From the plot, there is overlap in two of the clusters - probably coronavirus tweet and disaster tweet. Coronavirus tweets may have something in common with disaster tweets, given coronavirus is also a disaster.

4. Algorithm Motivation

The intial motivation of this analysis is to evaluate

  • 1.A topic modeling(text clustering) algorithm's performance (like Latent Dirichlet Allocation(LDA))
  • 2.Compare with more ordinal clustering algorithm like Kmeans
  • We are interested to see what additional information we can gain from LDA, which is a algorithm designed for topic exploration

5. Run Two Algorithms

Prepare to run LDA on cleaned text data

In [37]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)  

#Corpus
texts = data_lemmatized 

# Term Document Frequency (bag of words)
corpus = [id2word.doc2bow(text) for text in texts]

print('LDA Model-read version example:',corpus[:1])
print('human-read version example:',[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])
LDA Model-read version example: [[(0, 1), (1, 1), (2, 1), (3, 1)]]
human-read version example: [[('car', 1), ('crash', 1), ('happen', 1), ('terrible', 1)]]

How many clustering(topics) we should expect? - Let's use coherence score as our 'LDA elbow method' to tell us

In [38]:
# How many clustering(topics) we should expect? - Let's use coherence score as our LDA-elbow method to tell us
def compute_coherence_values(dictionary, corpus, texts, limit, start=1, step=1):

    coherence_values = []
    model_list = []
    perplexity = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=430,
                                           update_every=1,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity.append(model.log_perplexity(corpus))
    return model_list, coherence_values,perplexity
In [39]:
model_list, coherence_values, perplexity = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=1, limit=6, step=1)
     
In [40]:
###### Show graph
limit=6; start=1; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()
In [41]:
###### Show graph
limit=6; start=1; step=1;
x = range(start, limit, step)
plt.plot(x, perplexity)
plt.xlabel("Num Topics")
plt.ylabel("Perplexity")
plt.legend(("Perplexity"), loc='best')
plt.show()

Coherence score is high at #topic = 2 and 3, but perplexity at #topic = 2 is high, thus we will pick #topic = 3

  • Note that lower perplexity indicate the model is less confuses, higher coherence score meaning words within the same topic are more relavent. Consider this as Separation and Cohesion in normal clustering.
In [78]:
LDA_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=123,
                                           update_every=1,
                                           passes=80,
                                           alpha='auto',
                                           per_word_topics=True)
In [73]:
LDA_model.print_topics()
Out[73]:
[(0,
  '0.027*"http" + 0.015*"covid" + 0.014*"coronavirus" + 0.010*"food" + 0.008*"people" + 0.007*"amp" + 0.006*"need" + 0.006*"store" + 0.005*"stock" + 0.005*"supermarket"'),
 (1,
  '0.018*"wine" + 0.015*"flavor" + 0.013*"http" + 0.012*"aroma" + 0.011*"fruit" + 0.010*"finish" + 0.009*"palate" + 0.008*"ripe" + 0.007*"drink" + 0.006*"tannin"'),
 (2,
  '0.024*"http" + 0.011*"covid" + 0.007*"coronavirus" + 0.006*"food" + 0.004*"like" + 0.004*"consumer" + 0.003*"amp" + 0.003*"flavor" + 0.003*"grocery_store" + 0.003*"stock"')]

It's hard to read, let's view it as word cloud

In [68]:
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = LDA_model.show_topics(formatted=False)

fig, axes = plt.subplots(1, 3, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

It's clear, by reading the key words of each topic, I can draw the following conclusion:

    1. Topic 0 is definitely about wine review
    1. Topic 2 and topic 3 are probably mixture of coronavirus tweet, and disaster tweet. We are seeing some overlapping here.

Verify cluster distance and overlapping in pyLDAvis

In [81]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(LDA_model, corpus, dictionary=LDA_model.id2word)
vis
Out[81]:

Determine Dominant Topic

In [79]:
def format_topics_sentences(ldamodel=None, corpus=corpus):
    sent_topics_df = pd.DataFrame()

    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=LDA_model, corpus=corpus)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',]
df_dominant_topic.head(5)
Out[79]:
Document_No Dominant_Topic Topic_Perc_Contrib Keywords
0 0 0.0 0.9851 http, covid, coronavirus, food, people, amp, s...
1 1 1.0 0.9867 http, covid, annihilation, attack, like, amp, ...
2 2 1.0 0.9886 http, covid, annihilation, attack, like, amp, ...
3 3 1.0 0.9802 http, covid, annihilation, attack, like, amp, ...
4 4 0.0 0.9880 http, covid, coronavirus, food, people, amp, s...

Kmeans

Kmeans with TF-IDF

In [82]:
# elbow plot 
cluster_num_list=range(1,6)
avg_inertia_list=[]
for k in cluster_num_list:
    sub_inertia_list=[]
    for i in range(0,3):
        kmeans=KMeans(n_clusters=k, init='k-means++',).fit(X1)
        sub_inertia_list.append(kmeans.inertia_)
    avg_inertia_list.append(np.average(sub_inertia_list))
In [83]:
#Plot it    
plt.plot(cluster_num_list,avg_inertia_list)
plt.xlabel('Number of Clusters Requested in K-means')
plt.ylabel('Average Inertia of the K-Means Results (3 trials)')
plt.title('Elbow Method Results')
plt.show()
  • From the elbow plot, we k=2 or k=3 are both fine. Therefore, we choose k=3.
In [84]:
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = 1000).fit(X1)
pred_tfidf = kmeans.labels_
In [85]:
# TSNE 
tsne_tf = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X1.todense()))
In [86]:
ax = sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=pred_tfidf)
ax.set_title('TSNE Plot Colored by Predictions')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
  • From the tsne plot hued by predictions, two clusters are mixed and one cluster is independent.
  • We can see from the Top terms of the clusters, cluster 0 and cluster 1 are mixed(disaster and covid) and cluster 2 is wine.
In [87]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
print("Top terms per cluster:")
for i in range(k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print("\n")
Top terms per cluster:
Cluster 0:
 http
 covid
 coronavirus
 consumer
 annihilation
 salt_river
 wild_horse
 arson
 store
 attack


Cluster 1:
 covid
 food
 people
 coronavirus
 amp
 stock
 need
 supermarket
 blew
 like


Cluster 2:
 wine
 flavor
 fruit
 aroma
 finish
 ripe
 palate
 drink
 note
 tannin


K-means with Bag-of-Words

In [89]:
vectorizer2 = CountVectorizer(strip_accents='unicode', stop_words='english')
X2 = vectorizer2.fit_transform(documents)
X2_new = PCA(n_components=50, random_state=430).fit_transform(X2.todense())
In [90]:
# elbow plot 
cluster_num_list=range(1,6)
avg_inertia_list=[]
for k in cluster_num_list:
    sub_inertia_list=[]
    for i in range(0,3):
        kmeans=KMeans(n_clusters=k, init='k-means++',).fit(X2)
        sub_inertia_list.append(kmeans.inertia_)
    avg_inertia_list.append(np.average(sub_inertia_list))
  • From the elbow plot, we k=2 or k=3 are both fine. Therefore, we choose k=3.
In [91]:
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = 430)
kmeans.fit(X2)
pred_bw = kmeans.labels_
In [92]:
# TSNE 
tsne_bw = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X2.todense()))
In [93]:
ax = sns.scatterplot(x=tsne_bw[:,0],y=tsne_bw[:,1],sizes=(30, 400), hue=pred_bw)
ax.set_title('TSNE Plot Colored by Predictions')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
In [94]:
ax = sns.scatterplot(x=tsne_bw[:,0],y=tsne_bw[:,1],sizes=(30, 400), hue=labels)
ax.set_title('TSNE Plot Colored by Real labels')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
  • From the tsne plot hued by predictions, there are 3 clusters, but size of one cluster is very small.
  • We can see from the Top terms of the clusters, cluster 0 and cluster 1 are mixed(disaster and covid) and cluster 2 is wine.
In [95]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer2.get_feature_names()
print("Top terms per cluster:")
for i in range(k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print("\n")
Top terms per cluster:
Cluster 0:
 http
 covid
 consumer
 coronavirus
 amp
 business
 pick
 fan_army
 impact
 food


Cluster 1:
 http
 covid
 coronavirus
 food
 amp
 people
 stock
 store
 supermarket
 need


Cluster 2:
 wine
 flavor
 aroma
 fruit
 finish
 palate
 ripe
 drink
 note
 tannin


6. Post-Analysis Questions

Separation and Cohesion - Silhouette Coefficient

In [99]:
print('kmeans-TF_IDF',silhouette_score(X1, pred_tfidf))
print('kmeans-Bag of Words',silhouette_score(X1, pred_bw))
kmeans-TF_IDF 0.009572647252521584
kmeans-Bag of Words 0.007067742264542513

Adjusted RAND

In [103]:
#0: random labeling independently of the number of clusters and samples
#1: clusterings are identical
print('Bag-of-words LDA by dominant topic',adjusted_rand_score(df_dominant_topic['Dominant_Topic'], labels))
print('kmeans-TF_IDF',adjusted_rand_score(pred_tfidf, labels))
print('kmeans-Bag of Words',adjusted_rand_score(pred_bw, labels))
Bag-of-words LDA by dominant topic 0.5157911934714806
kmeans-TF_IDF 0.5022294933630596
kmeans-Bag of Words 0.49177903049626587

Homogeneity

In [124]:
# each cluster contains only members of a single class
print('Bag-of-words LDA by dominant topic',homogeneity_score(df_dominant_topic['Dominant_Topic'], label))
print('kmeans-TF_IDF',homogeneity_score(pred_tfidf, labels))
print('kmeans-Bag of Words',homogeneity_score(pred_bw, labels))
Bag-of-words LDA by dominant topic 0.5413796457573575
kmeans-TF_IDF 0.577579692659428
kmeans-Bag of Words 0.6559580193658725

Completeness score

In [106]:
# all members of a given class are assigned to the same cluster
print('Bag-of-words LDA by dominant topic',completeness_score(df_dominant_topic['Dominant_Topic'], labels))
print('kmeans-TF_IDF',completeness_score(pred_tfidf, labels))
print('kmeans-Bag of Words',completeness_score(pred_bw, labels))
Bag-of-words LDA by dominant topic 0.5144442475062223
kmeans-TF_IDF 0.5774079188278962
kmeans-Bag of Words 0.5156291054169841

T-sne plot

In [121]:
# LDA
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=np.array(df_dominant_topic['Dominant_Topic'].astype(int)), style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
In [108]:
# kmeans-TF_IDF
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=pred_tfidf, style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
In [109]:
# kmeans-Bag of Words
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    sns.scatterplot(x=tsne_tf[:,0],y=tsne_bw[:,1],sizes=(30, 400), hue=pred_bw, style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()

7. Analysis Summary

Algorithm Comparison Summary

In [123]:
import base64, io, IPython
from PIL import Image as PILImage

image = PILImage.open('1.png')

output = io.BytesIO()
image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
Out[123]:

LDA is a soft clustering, in order to make the comparison at hard assignment level, here we take a dominant topic (the cluster with highest probability) in order to form the comparison.

In summary, LDA is good at recognizing the topics discussed in the documents, and provide good performance with simple text convertion technique like bag-of-words, K-means can give good performance when using TF-IDF, when using bag-of-words, the result is less interpretable than LDA.

Insights Summary

LDA advantage:

  1. easy to visualize inter-topic map, a better way to evaluate cluster distance than T-sne
  2. Soft clustering, provide a probability distribution over topics as result, easy to recognize observations that are harder to determine cluster assginment
  3. Results are more interpretable, with weights assgined to keywords in each topic(ex: Topic0 = '0.018"wine" + 0.015"flavor" + 0.013"http" + 0.012"aroma" + 0.011"fruit" + 0.010"finish")

K-means advantage:

  1. Hard cluster assignment, can result in higher Homogeneity and Completeness score
  2. Good performance when ultize TF-IDF, yield to or exceed LDA